{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import gym\n",
    "import os \n",
    "import sys\n",
    "import itertools\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from collections import defaultdict, namedtuple\n",
    "\n",
    "import matplotlib\n",
    "from matplotlib import pyplot as plt\n",
    "%matplotlib inline\n",
    "matplotlib.style.use('ggplot')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.\u001b[0m\n",
      "env.action_sapce: 2\n",
      "env.observation_sapce: 4\n",
      "env.observation_space.high: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]\n",
      "env.observation_space.low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]\n"
     ]
    }
   ],
   "source": [
    "env = gym.envs.make('CartPole-v0')\n",
    "env = env.unwrapped\n",
    "env.seed(1)\n",
    "\n",
    "print(\"env.action_sapce:\", env.action_space.n)\n",
    "print(\"env.observation_sapce:\", env.observation_space.shape[0])\n",
    "print(\"env.observation_space.high:\", env.observation_space.high)\n",
    "print(\"env.observation_space.low:\", env.observation_space.low)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PolicyGradient():\n",
    "    \"\"\"\n",
    "    Policy Gradient REinforcement Learning.\n",
    "    used a 3 layer neural network as the policy network.\n",
    "    \"\"\"\n",
    "    def __init__(self, n_x, n_y,\n",
    "                learning_rate=0.01, reward_decay=0.95, load_path=None, save_path=None):\n",
    "        self.n_x = n_x\n",
    "        self.n_y = n_y\n",
    "        self.lr = learning_rate\n",
    "        self.reward_decay = reward_decay\n",
    "        \n",
    "        self.episode_states, self.episode_actions, self.episode_rewards = [], [], []\n",
    "        self.cost_history = []\n",
    "        \n",
    "        self.__build_network()\n",
    "        self.sess = tf.Session()\n",
    "        \n",
    "        tf.summary.FileWriter(\"logs/\", self.sess.graph)\n",
    "        self.sess.run(tf.global_variables_initializer())\n",
    "        self.saver = tf.train.Saver()\n",
    "        \n",
    "    def choose_action(self, state):\n",
    "        \"\"\"\n",
    "        choose action base on given state\n",
    "        \"\"\"\n",
    "        # reshape state to (num_features, 1)\n",
    "        state = state[:, np.newaxis]\n",
    "        \n",
    "        # get softmax probabilities\n",
    "        prob_weights = self.sess.run(self.outputs_softmax, feed_dict={self.X: state})\n",
    "        \n",
    "        # return sampled action\n",
    "        action = np.random.choice(range(len(prob_weights.ravel())), p=prob_weights.ravel())\n",
    "        return action\n",
    "    \n",
    "    def store_transition(self, state, action, reward):\n",
    "        \"\"\"\n",
    "        Store game memory for network training\n",
    "        \"\"\"\n",
    "        self.episode_states.append(state)\n",
    "        self.episode_rewards.append(reward)\n",
    "        \n",
    "        action__ = np.zeros(self.n_y)\n",
    "        action__[action] = 1\n",
    "        self.episode_actions.append(action__)\n",
    "        \n",
    "    def learn(self):\n",
    "        \"\"\"\n",
    "        Accroding the game memory traing the network\n",
    "        \"\"\"\n",
    "        # discount and normalize episode reward\n",
    "        disc_norm_ep_reward = self.__disc_and_norm_rewards()\n",
    "        \n",
    "        # train on episodes\n",
    "        self.sess.run(self.train_op, feed_dict={\n",
    "            self.X: np.vstack(self.episode_states).T,\n",
    "            self.Y: np.vstack(self.episode_actions).T,\n",
    "            self.disc_norm_ep_reward: disc_norm_ep_reward,  \n",
    "        })\n",
    "        \n",
    "        # Reset the episode data\n",
    "        self.episode_states, self.episode_actions, self.episode_rewards  = [], [], []\n",
    "        \n",
    "        return disc_norm_ep_reward\n",
    "        \n",
    "    def __build_network(self):\n",
    "        \"\"\"\n",
    "        build the natural network\n",
    "        \"\"\"\n",
    "        # Create placeholders\n",
    "        with tf.name_scope('inputs'):\n",
    "            self.X = tf.placeholder(tf.float32, shape=(self.n_x, None), name=\"X\")\n",
    "            self.Y = tf.placeholder(tf.float32, shape=(self.n_y, None), name=\"Y\")\n",
    "            self.disc_norm_ep_reward = tf.placeholder(tf.float32, [None, ], name=\"actions_value\")\n",
    "\n",
    "        layer1_units = 10\n",
    "        layer2_units = 10\n",
    "        layer_output_units = self.n_y\n",
    "        \n",
    "        with tf.name_scope(\"parameter\"):\n",
    "            W1 = self.__weigfht_variable([layer1_units, self.n_x], \"W1\")\n",
    "            b1 = self.__bias_bariable([layer1_units, 1], \"b1\")\n",
    "            W2 = self.__weigfht_variable([layer2_units, layer1_units], \"W2\")\n",
    "            b2 = self.__bias_bariable([layer2_units, 1], \"b2\")\n",
    "            W3 = self.__weigfht_variable([self.n_y, layer2_units], \"W3\")\n",
    "            b3 = self.__bias_bariable([self.n_y, 1], \"b3\")\n",
    "        \n",
    "        with tf.name_scope(\"layer1\"):\n",
    "            z1 = tf.add(tf.matmul(W1, self.X), b1)\n",
    "            a1 = tf.nn.relu(z1)\n",
    "        with tf.name_scope(\"layer2\"):\n",
    "            z2 = tf.add(tf.matmul(W2, a1), b2)\n",
    "            a2 = tf.nn.relu(z2)\n",
    "        with tf.name_scope(\"layer_output\"):\n",
    "            z3 = tf.add(tf.matmul(W3, a2), b3)\n",
    "            a3 = tf.nn.softmax(z3)\n",
    "\n",
    "        # Softmax outputs, we need to transpose as tensorflow nn functions expects them in this shape\n",
    "        logits = tf.transpose(z3)\n",
    "        labels = tf.transpose(self.Y)\n",
    "        self.outputs_softmax = tf.nn.softmax(logits, name='A3')\n",
    "\n",
    "        with tf.name_scope('loss'):\n",
    "            neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)\n",
    "            loss = tf.reduce_mean(neg_log_prob * self.disc_norm_ep_reward)  # reward guided loss\n",
    "\n",
    "        with tf.name_scope('train'):\n",
    "            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)\n",
    "        \n",
    "    def __weigfht_variable(self, shape, name):\n",
    "        initial = tf.contrib.layers.xavier_initializer(seed=1)\n",
    "        return tf.get_variable(name, shape, initializer=initial)\n",
    "    \n",
    "    def __bias_bariable(self, shape, name):\n",
    "        initial = tf.contrib.layers.xavier_initializer(seed=1)\n",
    "        return tf.get_variable(name, shape, initializer=initial)\n",
    "        \n",
    "    def __disc_and_norm_rewards(self):   \n",
    "        disc_norm_ep_rewards = np.zeros_like(self.episode_rewards)\n",
    "        c = 0\n",
    "        for t in reversed(range(len(self.episode_rewards))):\n",
    "            c = c * self.reward_decay + self.episode_rewards[t]\n",
    "            disc_norm_ep_rewards[t] = c\n",
    "\n",
    "        disc_norm_ep_rewards -= np.mean(disc_norm_ep_rewards)\n",
    "        disc_norm_ep_rewards /= np.std(disc_norm_ep_rewards)\n",
    "        return disc_norm_ep_rewards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Monte_Carlo_Policy_Gradient():\n",
    "    \"\"\"\n",
    "    Monte Carlo Policy Gradient method class\n",
    "    \"\"\"\n",
    "    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):\n",
    "        \n",
    "        self.nA = env.action_space.n\n",
    "        self.nS = env.observation_space.shape[0]\n",
    "        self.env = env\n",
    "        self.num_episodes = num_episodes\n",
    "        self.reward_decay = reward_decay\n",
    "        self.learning_rate = learning_rate\n",
    "        self.rewards = []\n",
    "        self.RENDER_REWARD_MIN = 50\n",
    "        self.RENDER_ENV = False\n",
    "        self.PG = PolicyGradient(n_x=self.nS, n_y=self.nA, \n",
    "                                 learning_rate=self.learning_rate,\n",
    "                                 reward_decay=self.reward_decay)\n",
    "        \n",
    "        # keep track of useful statistic\n",
    "        record_head = namedtuple(\"Stats\", [\"episode_lengths\",\"episode_rewards\"])\n",
    "        self.record = record_head(\n",
    "                                episode_lengths = np.zeros(num_episodes),\n",
    "                                episode_rewards = np.zeros(num_episodes))\n",
    "        \n",
    "    def mcpg_learn(self):\n",
    "        \"\"\"\n",
    "        Monte Carlo Policy Gradient method core code. \n",
    "        \"\"\"\n",
    "        for i_episode in range(self.num_episodes):\n",
    "            # print the number iter episode\n",
    "            num_present = (i_episode+1) / self.num_episodes\n",
    "            print(\"Episode {}/{}\".format(i_episode + 1, self.num_episodes)) # end=\"\"\n",
    "            print(\"=\" * round(num_present*60))\n",
    "        \n",
    "            # Reset the environment and pick the first action\n",
    "            state = env.reset()\n",
    "            reward = 0\n",
    "            \n",
    "            # One step in the environemt, replace code(while(True))\n",
    "            for t in itertools.count():\n",
    "                if self.RENDER_ENV: env.render()\n",
    "                \n",
    "                # step1: choose an action basoed on state\n",
    "                action = self.PG.choose_action(state)\n",
    "                \n",
    "                # step2: take action in the environment\n",
    "                next_state, reward, done, _ = env.step(action)\n",
    "                \n",
    "                # step3: store transition for training\n",
    "                self.PG.store_transition(state, action, reward)\n",
    "                \n",
    "                # update statistics\n",
    "                self.record.episode_rewards[i_episode] += reward\n",
    "                self.record.episode_lengths[i_episode] = t\n",
    "                \n",
    "                if done:\n",
    "                    episode_rewards_sum = sum(self.PG.episode_rewards)\n",
    "                    self.rewards.append(episode_rewards_sum)\n",
    "                    max_reward = np.amax(self.rewards)\n",
    "                    \n",
    "                    # step4: end of episode tran the PG network\n",
    "                    _  = self.PG.learn()\n",
    "                                    \n",
    "                    print(\"reward:{}, max reward:{}, episode len:{}\\n\".format(episode_rewards_sum, max_reward, t))\n",
    "                    if max_reward > self.RENDER_REWARD_MIN: self.RENDER_ENV = True\n",
    "                    break\n",
    "                    \n",
    "                # step5: save new state\n",
    "                state = next_state\n",
    "        \n",
    "        return self.record"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Episode 1/200\n",
      "\n",
      "reward:19.0, max reward:19.0, episode len:18\n",
      "\n",
      "Episode 2/200\n",
      "=\n",
      "reward:10.0, max reward:19.0, episode len:9\n",
      "\n",
      "Episode 3/200\n",
      "=\n",
      "reward:13.0, max reward:19.0, episode len:12\n",
      "\n",
      "Episode 4/200\n",
      "=\n",
      "reward:20.0, max reward:20.0, episode len:19\n",
      "\n",
      "Episode 5/200\n",
      "==\n",
      "reward:13.0, max reward:20.0, episode len:12\n",
      "\n",
      "Episode 6/200\n",
      "==\n",
      "reward:11.0, max reward:20.0, episode len:10\n",
      "\n",
      "Episode 7/200\n",
      "==\n",
      "reward:14.0, max reward:20.0, episode len:13\n",
      "\n",
      "Episode 8/200\n",
      "==\n",
      "reward:16.0, max reward:20.0, episode len:15\n",
      "\n",
      "Episode 9/200\n",
      "===\n",
      "reward:22.0, max reward:22.0, episode len:21\n",
      "\n",
      "Episode 10/200\n",
      "===\n",
      "reward:39.0, max reward:39.0, episode len:38\n",
      "\n",
      "Episode 11/200\n",
      "===\n",
      "reward:30.0, max reward:39.0, episode len:29\n",
      "\n",
      "Episode 12/200\n",
      "====\n",
      "reward:13.0, max reward:39.0, episode len:12\n",
      "\n",
      "Episode 13/200\n",
      "====\n",
      "reward:9.0, max reward:39.0, episode len:8\n",
      "\n",
      "Episode 14/200\n",
      "====\n",
      "reward:13.0, max reward:39.0, episode len:12\n",
      "\n",
      "Episode 15/200\n",
      "====\n",
      "reward:12.0, max reward:39.0, episode len:11\n",
      "\n",
      "Episode 16/200\n",
      "=====\n",
      "reward:15.0, max reward:39.0, episode len:14\n",
      "\n",
      "Episode 17/200\n",
      "=====\n",
      "reward:25.0, max reward:39.0, episode len:24\n",
      "\n",
      "Episode 18/200\n",
      "=====\n",
      "reward:91.0, max reward:91.0, episode len:90\n",
      "\n",
      "Episode 19/200\n",
      "======\n",
      "reward:32.0, max reward:91.0, episode len:31\n",
      "\n",
      "Episode 20/200\n",
      "======\n",
      "reward:30.0, max reward:91.0, episode len:29\n",
      "\n",
      "Episode 21/200\n",
      "======\n",
      "reward:25.0, max reward:91.0, episode len:24\n",
      "\n",
      "Episode 22/200\n",
      "=======\n",
      "reward:12.0, max reward:91.0, episode len:11\n",
      "\n",
      "Episode 23/200\n",
      "=======\n",
      "reward:17.0, max reward:91.0, episode len:16\n",
      "\n",
      "Episode 24/200\n",
      "=======\n",
      "reward:32.0, max reward:91.0, episode len:31\n",
      "\n",
      "Episode 25/200\n",
      "========\n",
      "reward:15.0, max reward:91.0, episode len:14\n",
      "\n",
      "Episode 26/200\n",
      "========\n",
      "reward:24.0, max reward:91.0, episode len:23\n",
      "\n",
      "Episode 27/200\n",
      "========\n",
      "reward:36.0, max reward:91.0, episode len:35\n",
      "\n",
      "Episode 28/200\n",
      "========\n",
      "reward:35.0, max reward:91.0, episode len:34\n",
      "\n",
      "Episode 29/200\n",
      "=========\n",
      "reward:36.0, max reward:91.0, episode len:35\n",
      "\n",
      "Episode 30/200\n",
      "=========\n",
      "reward:14.0, max reward:91.0, episode len:13\n",
      "\n",
      "Episode 31/200\n",
      "=========\n",
      "reward:15.0, max reward:91.0, episode len:14\n",
      "\n",
      "Episode 32/200\n",
      "==========\n",
      "reward:47.0, max reward:91.0, episode len:46\n",
      "\n",
      "Episode 33/200\n",
      "==========\n",
      "reward:39.0, max reward:91.0, episode len:38\n",
      "\n",
      "Episode 34/200\n",
      "==========\n",
      "reward:18.0, max reward:91.0, episode len:17\n",
      "\n",
      "Episode 35/200\n",
      "==========\n",
      "reward:21.0, max reward:91.0, episode len:20\n",
      "\n",
      "Episode 36/200\n",
      "===========\n",
      "reward:10.0, max reward:91.0, episode len:9\n",
      "\n",
      "Episode 37/200\n",
      "===========\n",
      "reward:23.0, max reward:91.0, episode len:22\n",
      "\n",
      "Episode 38/200\n",
      "===========\n",
      "reward:21.0, max reward:91.0, episode len:20\n",
      "\n",
      "Episode 39/200\n",
      "============\n",
      "reward:12.0, max reward:91.0, episode len:11\n",
      "\n",
      "Episode 40/200\n",
      "============\n",
      "reward:30.0, max reward:91.0, episode len:29\n",
      "\n",
      "Episode 41/200\n",
      "============\n",
      "reward:12.0, max reward:91.0, episode len:11\n",
      "\n",
      "Episode 42/200\n",
      "=============\n",
      "reward:28.0, max reward:91.0, episode len:27\n",
      "\n",
      "Episode 43/200\n",
      "=============\n",
      "reward:19.0, max reward:91.0, episode len:18\n",
      "\n",
      "Episode 44/200\n",
      "=============\n",
      "reward:24.0, max reward:91.0, episode len:23\n",
      "\n",
      "Episode 45/200\n",
      "==============\n",
      "reward:9.0, max reward:91.0, episode len:8\n",
      "\n",
      "Episode 46/200\n",
      "==============\n",
      "reward:23.0, max reward:91.0, episode len:22\n",
      "\n",
      "Episode 47/200\n",
      "==============\n",
      "reward:13.0, max reward:91.0, episode len:12\n",
      "\n",
      "Episode 48/200\n",
      "==============\n",
      "reward:21.0, max reward:91.0, episode len:20\n",
      "\n",
      "Episode 49/200\n",
      "===============\n",
      "reward:10.0, max reward:91.0, episode len:9\n",
      "\n",
      "Episode 50/200\n",
      "===============\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-5-7b579d14ec8c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mtf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreset_default_graph\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mmcpg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMonte_Carlo_Policy_Gradient\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnum_episodes\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m200\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmcpg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmcpg_learn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-4-7ad265720256>\u001b[0m in \u001b[0;36mmcpg_learn\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     40\u001b[0m             \u001b[1;31m# One step in the environemt, replace code(while(True))\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     41\u001b[0m             \u001b[1;32mfor\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mitertools\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 42\u001b[1;33m                 \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRENDER_ENV\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     43\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     44\u001b[0m                 \u001b[1;31m# step1: choose an action basoed on state\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gym\\envs\\classic_control\\cartpole.py\u001b[0m in \u001b[0;36mrender\u001b[1;34m(self, mode)\u001b[0m\n\u001b[0;32m    135\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpoletrans\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_rotation\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    136\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 137\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mviewer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreturn_rgb_array\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m==\u001b[0m\u001b[1;34m'rgb_array'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    138\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    139\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gym\\envs\\classic_control\\rendering.py\u001b[0m in \u001b[0;36mrender\u001b[1;34m(self, return_rgb_array)\u001b[0m\n\u001b[0;32m    103\u001b[0m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwidth\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    104\u001b[0m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 105\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwindow\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    106\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0monetime_geoms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    107\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mreturn_rgb_array\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misopen\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyglet\\window\\win32\\__init__.py\u001b[0m in \u001b[0;36mflip\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    319\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdraw_mouse_cursor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    323\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mset_location\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyglet\\gl\\win32.py\u001b[0m in \u001b[0;36mflip\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    224\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    225\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 226\u001b[1;33m         \u001b[0m_gdi32\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSwapBuffers\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcanvas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhdc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    227\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    228\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget_vsync\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph() \n",
    "mcpg = Monte_Carlo_Policy_Gradient(env, num_episodes=200)\n",
    "result = mcpg.mcpg_learn()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def plot_episode_stats(stats, smoothing_window=10, noshow=False):\n",
    "    # Plot the episode length over time\n",
    "    fig1 = plt.figure(figsize=(13,5))\n",
    "    plt.plot(stats.episode_lengths)\n",
    "    plt.xlabel(\"Episode\")\n",
    "    plt.ylabel(\"Episode Length\")\n",
    "    plt.title(\"Episode Length over Time\")\n",
    "    if noshow:\n",
    "        plt.close(fig1)\n",
    "    else:\n",
    "        plt.show(fig1)\n",
    "\n",
    "    # Plot the episode reward over time\n",
    "    fig2 = plt.figure(figsize=(13,5))\n",
    "    rewards_smoothed = pd.Series(stats.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()\n",
    "    plt.plot(rewards_smoothed)\n",
    "    plt.xlabel(\"Episode\")\n",
    "    plt.ylabel(\"Episode Reward (Smoothed)\")\n",
    "    plt.title(\"Episode Reward over Time (Smoothed over window size {})\".format(smoothing_window))\n",
    "    if noshow:\n",
    "        plt.close(fig2)\n",
    "    else:\n",
    "        plt.show(fig2)\n",
    "\n",
    "    # Plot time steps and episode number\n",
    "    fig3 = plt.figure(figsize=(13,5))\n",
    "    plt.plot(np.cumsum(stats.episode_lengths), np.arange(len(stats.episode_lengths)))\n",
    "    plt.xlabel(\"Time Steps\")\n",
    "    plt.ylabel(\"Episode\")\n",
    "    plt.title(\"Episode per time step\")\n",
    "    if noshow:\n",
    "        plt.close(fig3)\n",
    "    else:\n",
    "        plt.show(fig3)\n",
    "\n",
    "    return fig1, fig2, fig3\n",
    "\n",
    "plot_episode_stats(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
